import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the corrosion data set into a DataFrame. The path is relative, so it is
# resolved against the directory the notebook kernel was started in.
csv_path = '../../from_2100/ppg_corrosion/ppg_corrosion.csv'
ppg = pd.read_csv(csv_path)

# Column-by-column summary: non-null counts and dtypes.
ppg.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2000 entries, 0 to 1999 Data columns (total 79 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 X01 2000 non-null float64 1 X02 2000 non-null float64 2 X03 2000 non-null float64 3 X04 2000 non-null float64 4 X05 2000 non-null float64 5 X06 2000 non-null float64 6 X07 2000 non-null float64 7 X08 2000 non-null float64 8 X09 2000 non-null float64 9 X10 2000 non-null float64 10 X11 2000 non-null float64 11 X12 2000 non-null float64 12 X13 2000 non-null float64 13 X14 2000 non-null float64 14 X15 2000 non-null float64 15 X16 2000 non-null int64 16 X17 2000 non-null int64 17 X18 2000 non-null int64 18 X19 2000 non-null float64 19 X20 2000 non-null float64 20 X21 2000 non-null float64 21 X22 2000 non-null float64 22 X23 2000 non-null float64 23 X24 2000 non-null float64 24 X25 2000 non-null float64 25 X26 2000 non-null float64 26 X27 2000 non-null float64 27 X28 2000 non-null float64 28 X29 2000 non-null float64 29 X30 2000 non-null float64 30 X31 2000 non-null float64 31 X32 2000 non-null float64 32 X33 2000 non-null float64 33 X34 2000 non-null float64 34 X35 2000 non-null float64 35 X36 2000 non-null float64 36 X37 2000 non-null float64 37 X38 2000 non-null float64 38 X39 2000 non-null float64 39 X40 2000 non-null float64 40 X41 2000 non-null float64 41 X42 2000 non-null float64 42 X43 2000 non-null float64 43 X44 2000 non-null float64 44 X45 2000 non-null float64 45 X46 2000 non-null float64 46 X47 2000 non-null float64 47 X48 2000 non-null float64 48 X49 2000 non-null float64 49 X50 2000 non-null float64 50 X51 2000 non-null float64 51 X52 2000 non-null float64 52 X53 2000 non-null float64 53 X54 2000 non-null float64 54 X55 2000 non-null float64 55 X56 2000 non-null float64 56 X57 2000 non-null float64 57 X58 2000 non-null float64 58 X59 2000 non-null float64 59 X60 2000 non-null float64 60 X61 2000 non-null float64 61 X62 2000 non-null float64 62 X63 2000 non-null float64 63 X64 2000 
non-null float64 64 X65 2000 non-null float64 65 X66 2000 non-null float64 66 X67 2000 non-null float64 67 X68 2000 non-null float64 68 X69 2000 non-null float64 69 X70 2000 non-null float64 70 X71 2000 non-null float64 71 X72 2000 non-null float64 72 X73 2000 non-null float64 73 X74 2000 non-null float64 74 X75 2000 non-null float64 75 X76 2000 non-null float64 76 X77 2000 non-null float64 77 X78 2000 non-null float64 78 corrosion 2000 non-null object dtypes: float64(75), int64(3), object(1) memory usage: 1.2+ MB
ppg.shape
(2000, 79)
ppg.columns
Index(['X01', 'X02', 'X03', 'X04', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10',
'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20',
'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30',
'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40',
'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50',
'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60',
'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70',
'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78', 'corrosion'],
dtype='object')
ppg.dtypes
X01 float64
X02 float64
X03 float64
X04 float64
X05 float64
...
X75 float64
X76 float64
X77 float64
X78 float64
corrosion object
Length: 79, dtype: object
# Tally how many columns there are of each dtype.
ppg.dtypes.value_counts()
float64 75 int64 3 object 1 Name: count, dtype: int64
ppg.select_dtypes('number').columns
Index(['X01', 'X02', 'X03', 'X04', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10',
'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20',
'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30',
'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40',
'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50',
'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60',
'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70',
'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78'],
dtype='object')
# Every numeric column is treated as an input; the only non-numeric column in
# this frame is the `corrosion` output.
input_names = list(ppg.select_dtypes(include='number').columns)

# Peek at the first few names to confirm the result is a plain list of strings.
input_names[:7]
['X01', 'X02', 'X03', 'X04', 'X05', 'X06', 'X07']
ppg.nunique()
X01 1998
X02 1994
X03 1996
X04 2000
X05 2000
...
X75 2000
X76 2000
X77 2000
X78 2000
corrosion 2
Length: 79, dtype: int64
# Count the rows in each class of the categorical output to check class balance.
ppg.corrosion.value_counts()
corrosion yes 1000 no 1000 Name: count, dtype: int64
ppg.isna().sum()
X01 0
X02 0
X03 0
X04 0
X05 0
..
X75 0
X76 0
X77 0
X78 0
corrosion 0
Length: 79, dtype: int64
# Summarize the per-column missing counts: a single entry for 0 means that no
# column has any missing values.
ppg.isna().sum().value_counts()
0 79 Name: count, dtype: int64
# Bar chart of the number of rows in each `corrosion` class.
sns.catplot(data = ppg, x='corrosion', kind='count')
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
input_names
['X01', 'X02', 'X03', 'X04', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X36', 'X37', 'X38', 'X39', 'X40', 'X41', 'X42', 'X43', 'X44', 'X45', 'X46', 'X47', 'X48', 'X49', 'X50', 'X51', 'X52', 'X53', 'X54', 'X55', 'X56', 'X57', 'X58', 'X59', 'X60', 'X61', 'X62', 'X63', 'X64', 'X65', 'X66', 'X67', 'X68', 'X69', 'X70', 'X71', 'X72', 'X73', 'X74', 'X75', 'X76', 'X77', 'X78']
# Marginal distribution of a single input (X01) as a histogram.
sns.displot(data = ppg, x='X01', kind='hist')
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Same histogram for the next input (X02). With the wide-format frame each
# input needs its own call like this one.
sns.displot(data = ppg, x='X02', kind='hist')
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
We have a lot of typing to do to create ALL marginal distribution plots!
We want to know how the distribution of each input relates to the unique values of the categorical output!
# Box plot of X01 grouped by the `corrosion` class.
sns.catplot(data = ppg, x='corrosion', y='X01', kind='box')
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Violin plot of X01 grouped by the `corrosion` class.
sns.catplot(data = ppg, x='corrosion', y='X01', kind='violin')
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
It would be nice to show the conditional distributions on top of each other! That's what the CONDITIONAL KDE plot does!
# Overlay one KDE curve of X01 per `corrosion` class (hue).
# common_norm=False normalizes each class's density independently instead of
# scaling the curves by the number of observations in each class.
sns.displot(data = ppg, x='X01', hue='corrosion', kind='kde', common_norm=False)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
How does the AVERAGE input change across the categories of the binary output?
# Point plot of the per-class estimate of X01 (seaborn's default estimator is
# the mean) with its error bar; join=False omits the line connecting the classes.
# NOTE(review): seaborn 0.13 deprecates `join` in favor of linestyle='none' --
# check the installed seaborn version before changing this argument.
sns.catplot(data = ppg, x='corrosion', y='X01', kind='point', join=False)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
We have a lot of typing to do to complete the exploration for ALL inputs!
You might be tempted to use a FOR LOOP to manage the iteration across all inputs...but the FOR LOOP is not necessary!
The FOR LOOP can lead you into trouble that you may not realize, because you typically create the FOR LOOP after practicing on only a few variables!
Instead...let's use RESHAPING to reshape the WIDE FORMAT data into a LONG FORMAT data set!
ppg
| X01 | X02 | X03 | X04 | X05 | X06 | X07 | X08 | X09 | X10 | ... | X70 | X71 | X72 | X73 | X74 | X75 | X76 | X77 | X78 | corrosion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52.4988 | 22.4352 | 15.7248 | 6.300127 | 2.130775 | 1.396948 | 0.169325 | 0.165502 | -0.435505 | 2.164313 | ... | 114.475601 | 395.783685 | 0.5492 | 0.073049 | 888.703569 | 0.002911 | 515.732996 | 57.139364 | 2211.942462 | yes |
| 1 | 163.6560 | 161.7176 | 164.0408 | 8.108419 | 13.026321 | 15.191390 | 2.292762 | -2.654124 | -2.700556 | 8.429188 | ... | 111.268083 | 1116.853313 | 0.7908 | 0.050112 | 24964.871523 | 0.001674 | 20294.367898 | 39.680852 | 16779.045018 | no |
| 2 | 218.9088 | 217.9228 | 212.4132 | 5.880993 | 7.158718 | 7.370378 | -0.551242 | -0.718916 | -0.860859 | 4.751198 | ... | 101.217219 | 1611.247241 | 0.9060 | 0.043729 | 45281.834879 | 0.001453 | 40845.652236 | 34.635325 | 10159.520530 | no |
| 3 | 45.7112 | 29.8620 | 26.9060 | 6.656470 | 3.076322 | 2.237222 | 0.380476 | 0.434857 | 0.033068 | 2.156228 | ... | 121.376398 | 652.679503 | 0.6440 | 0.062262 | 1126.320497 | 0.002554 | 796.709264 | 48.727540 | 1968.103727 | yes |
| 4 | 244.6048 | 141.9512 | 129.5172 | 5.024004 | 16.503552 | 19.441422 | -1.712171 | 1.971901 | 1.974853 | 7.476739 | ... | 68.184074 | 1072.486039 | 0.7736 | 0.051221 | 28473.791624 | 0.001706 | 23440.503221 | 40.563108 | 20032.938987 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1995 | 164.0856 | 137.2884 | 113.3964 | 36.614804 | 54.923099 | 54.589736 | -0.593199 | -0.636593 | -0.495143 | 2.268658 | ... | 30.718405 | 1812.093299 | 0.9432 | 0.042067 | 21328.518236 | 0.001456 | 19595.830121 | 33.269305 | 3839.905004 | yes |
| 1996 | 42.5652 | 24.2252 | 21.2204 | 5.840698 | 2.702755 | 2.306474 | 0.335472 | 0.537995 | 0.313693 | 2.374385 | ... | 127.580076 | 611.538462 | 0.6344 | 0.063441 | 842.618537 | 0.002734 | 585.553772 | 49.465321 | 1282.624212 | yes |
| 1997 | 80.9216 | 71.1844 | 67.2952 | 25.794051 | 38.100226 | 41.485642 | -0.515049 | -0.302666 | -0.216146 | 1.697923 | ... | 45.252311 | 1452.745841 | 0.8656 | 0.046187 | 6109.939926 | 0.001883 | 5280.458892 | 36.252013 | 3181.100739 | yes |
| 1998 | 136.0328 | 62.9732 | 33.6664 | 8.161699 | 6.332241 | 4.890635 | 1.262157 | 1.915884 | 2.202250 | 6.302768 | ... | 114.696552 | 1460.018851 | 0.8700 | 0.045659 | 6443.152184 | 0.001618 | 5719.772883 | 36.068557 | 2061.489655 | yes |
| 1999 | 126.4380 | 119.4280 | 127.2596 | 13.291582 | 13.517367 | 12.350425 | -0.764488 | -1.018323 | -0.429107 | 4.132570 | ... | 70.186795 | 1871.270790 | 0.9572 | 0.041436 | 14538.563310 | 0.001420 | 13644.766405 | 32.782707 | 1569.125784 | yes |
2000 rows × 79 columns
Let's reshape by STACKING all INPUT COLUMNS on top of each other. The Pandas syntax for this operation is known as MELTING!!!!
# Melt with no arguments: every column, including `corrosion`, is stacked into
# (variable, value) pairs.
ppg.melt()
| variable | value | |
|---|---|---|
| 0 | X01 | 52.4988 |
| 1 | X01 | 163.656 |
| 2 | X01 | 218.9088 |
| 3 | X01 | 45.7112 |
| 4 | X01 | 244.6048 |
| ... | ... | ... |
| 157995 | corrosion | yes |
| 157996 | corrosion | yes |
| 157997 | corrosion | yes |
| 157998 | corrosion | yes |
| 157999 | corrosion | yes |
158000 rows × 2 columns
ppg.melt().info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 158000 entries, 0 to 157999 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 variable 158000 non-null object 1 value 158000 non-null object dtypes: object(2) memory usage: 2.4+ MB
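Melting without arguments also stacked the `corrosion` column, which is why `value` ended up with the object dtype. Instead, we need to specify which variables will NOT be stacked or GATHERED UP and which variables WILL be stacked together!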
# Keep `corrosion` as an identifier column; all remaining columns are stacked.
ppg.melt(id_vars=['corrosion'])
| corrosion | variable | value | |
|---|---|---|---|
| 0 | yes | X01 | 52.498800 |
| 1 | no | X01 | 163.656000 |
| 2 | no | X01 | 218.908800 |
| 3 | yes | X01 | 45.711200 |
| 4 | no | X01 | 244.604800 |
| ... | ... | ... | ... |
| 155995 | yes | X78 | 3839.905004 |
| 155996 | yes | X78 | 1282.624212 |
| 155997 | yes | X78 | 3181.100739 |
| 155998 | yes | X78 | 2061.489655 |
| 155999 | yes | X78 | 1569.125784 |
156000 rows × 3 columns
ppg.melt(id_vars=['corrosion']).info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 156000 entries, 0 to 155999 Data columns (total 3 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 corrosion 156000 non-null object 1 variable 156000 non-null object 2 value 156000 non-null float64 dtypes: float64(1), object(2) memory usage: 3.6+ MB
ppg.melt(id_vars=['corrosion']).variable.value_counts()
variable
X01 2000
X50 2000
X57 2000
X56 2000
X55 2000
...
X26 2000
X25 2000
X24 2000
X23 2000
X78 2000
Name: count, Length: 78, dtype: int64
# Same reshape, now spelling out both roles: `corrosion` identifies each row and
# only the columns listed in `input_names` are stacked.
pd.melt(
    ppg,
    id_vars=['corrosion'],
    value_vars=input_names,
    ignore_index=True,
)
| corrosion | variable | value | |
|---|---|---|---|
| 0 | yes | X01 | 52.498800 |
| 1 | no | X01 | 163.656000 |
| 2 | no | X01 | 218.908800 |
| 3 | yes | X01 | 45.711200 |
| 4 | no | X01 | 244.604800 |
| ... | ... | ... | ... |
| 155995 | yes | X78 | 3839.905004 |
| 155996 | yes | X78 | 1282.624212 |
| 155997 | yes | X78 | 3181.100739 |
| 155998 | yes | X78 | 2061.489655 |
| 155999 | yes | X78 | 1569.125784 |
156000 rows × 3 columns
ppg
| X01 | X02 | X03 | X04 | X05 | X06 | X07 | X08 | X09 | X10 | ... | X70 | X71 | X72 | X73 | X74 | X75 | X76 | X77 | X78 | corrosion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52.4988 | 22.4352 | 15.7248 | 6.300127 | 2.130775 | 1.396948 | 0.169325 | 0.165502 | -0.435505 | 2.164313 | ... | 114.475601 | 395.783685 | 0.5492 | 0.073049 | 888.703569 | 0.002911 | 515.732996 | 57.139364 | 2211.942462 | yes |
| 1 | 163.6560 | 161.7176 | 164.0408 | 8.108419 | 13.026321 | 15.191390 | 2.292762 | -2.654124 | -2.700556 | 8.429188 | ... | 111.268083 | 1116.853313 | 0.7908 | 0.050112 | 24964.871523 | 0.001674 | 20294.367898 | 39.680852 | 16779.045018 | no |
| 2 | 218.9088 | 217.9228 | 212.4132 | 5.880993 | 7.158718 | 7.370378 | -0.551242 | -0.718916 | -0.860859 | 4.751198 | ... | 101.217219 | 1611.247241 | 0.9060 | 0.043729 | 45281.834879 | 0.001453 | 40845.652236 | 34.635325 | 10159.520530 | no |
| 3 | 45.7112 | 29.8620 | 26.9060 | 6.656470 | 3.076322 | 2.237222 | 0.380476 | 0.434857 | 0.033068 | 2.156228 | ... | 121.376398 | 652.679503 | 0.6440 | 0.062262 | 1126.320497 | 0.002554 | 796.709264 | 48.727540 | 1968.103727 | yes |
| 4 | 244.6048 | 141.9512 | 129.5172 | 5.024004 | 16.503552 | 19.441422 | -1.712171 | 1.971901 | 1.974853 | 7.476739 | ... | 68.184074 | 1072.486039 | 0.7736 | 0.051221 | 28473.791624 | 0.001706 | 23440.503221 | 40.563108 | 20032.938987 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1995 | 164.0856 | 137.2884 | 113.3964 | 36.614804 | 54.923099 | 54.589736 | -0.593199 | -0.636593 | -0.495143 | 2.268658 | ... | 30.718405 | 1812.093299 | 0.9432 | 0.042067 | 21328.518236 | 0.001456 | 19595.830121 | 33.269305 | 3839.905004 | yes |
| 1996 | 42.5652 | 24.2252 | 21.2204 | 5.840698 | 2.702755 | 2.306474 | 0.335472 | 0.537995 | 0.313693 | 2.374385 | ... | 127.580076 | 611.538462 | 0.6344 | 0.063441 | 842.618537 | 0.002734 | 585.553772 | 49.465321 | 1282.624212 | yes |
| 1997 | 80.9216 | 71.1844 | 67.2952 | 25.794051 | 38.100226 | 41.485642 | -0.515049 | -0.302666 | -0.216146 | 1.697923 | ... | 45.252311 | 1452.745841 | 0.8656 | 0.046187 | 6109.939926 | 0.001883 | 5280.458892 | 36.252013 | 3181.100739 | yes |
| 1998 | 136.0328 | 62.9732 | 33.6664 | 8.161699 | 6.332241 | 4.890635 | 1.262157 | 1.915884 | 2.202250 | 6.302768 | ... | 114.696552 | 1460.018851 | 0.8700 | 0.045659 | 6443.152184 | 0.001618 | 5719.772883 | 36.068557 | 2061.489655 | yes |
| 1999 | 126.4380 | 119.4280 | 127.2596 | 13.291582 | 13.517367 | 12.350425 | -0.764488 | -1.018323 | -0.429107 | 4.132570 | ... | 70.186795 | 1871.270790 | 0.9572 | 0.041436 | 14538.563310 | 0.001420 | 13644.766405 | 32.782707 | 1569.125784 | yes |
2000 rows × 79 columns
I like to include a ROWID column to preserve the original row number that each observation comes from.
# reset_index() moves the row labels into a regular column, which is named
# `index` here.
ppg.reset_index()
| index | X01 | X02 | X03 | X04 | X05 | X06 | X07 | X08 | X09 | ... | X70 | X71 | X72 | X73 | X74 | X75 | X76 | X77 | X78 | corrosion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 52.4988 | 22.4352 | 15.7248 | 6.300127 | 2.130775 | 1.396948 | 0.169325 | 0.165502 | -0.435505 | ... | 114.475601 | 395.783685 | 0.5492 | 0.073049 | 888.703569 | 0.002911 | 515.732996 | 57.139364 | 2211.942462 | yes |
| 1 | 1 | 163.6560 | 161.7176 | 164.0408 | 8.108419 | 13.026321 | 15.191390 | 2.292762 | -2.654124 | -2.700556 | ... | 111.268083 | 1116.853313 | 0.7908 | 0.050112 | 24964.871523 | 0.001674 | 20294.367898 | 39.680852 | 16779.045018 | no |
| 2 | 2 | 218.9088 | 217.9228 | 212.4132 | 5.880993 | 7.158718 | 7.370378 | -0.551242 | -0.718916 | -0.860859 | ... | 101.217219 | 1611.247241 | 0.9060 | 0.043729 | 45281.834879 | 0.001453 | 40845.652236 | 34.635325 | 10159.520530 | no |
| 3 | 3 | 45.7112 | 29.8620 | 26.9060 | 6.656470 | 3.076322 | 2.237222 | 0.380476 | 0.434857 | 0.033068 | ... | 121.376398 | 652.679503 | 0.6440 | 0.062262 | 1126.320497 | 0.002554 | 796.709264 | 48.727540 | 1968.103727 | yes |
| 4 | 4 | 244.6048 | 141.9512 | 129.5172 | 5.024004 | 16.503552 | 19.441422 | -1.712171 | 1.971901 | 1.974853 | ... | 68.184074 | 1072.486039 | 0.7736 | 0.051221 | 28473.791624 | 0.001706 | 23440.503221 | 40.563108 | 20032.938987 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1995 | 1995 | 164.0856 | 137.2884 | 113.3964 | 36.614804 | 54.923099 | 54.589736 | -0.593199 | -0.636593 | -0.495143 | ... | 30.718405 | 1812.093299 | 0.9432 | 0.042067 | 21328.518236 | 0.001456 | 19595.830121 | 33.269305 | 3839.905004 | yes |
| 1996 | 1996 | 42.5652 | 24.2252 | 21.2204 | 5.840698 | 2.702755 | 2.306474 | 0.335472 | 0.537995 | 0.313693 | ... | 127.580076 | 611.538462 | 0.6344 | 0.063441 | 842.618537 | 0.002734 | 585.553772 | 49.465321 | 1282.624212 | yes |
| 1997 | 1997 | 80.9216 | 71.1844 | 67.2952 | 25.794051 | 38.100226 | 41.485642 | -0.515049 | -0.302666 | -0.216146 | ... | 45.252311 | 1452.745841 | 0.8656 | 0.046187 | 6109.939926 | 0.001883 | 5280.458892 | 36.252013 | 3181.100739 | yes |
| 1998 | 1998 | 136.0328 | 62.9732 | 33.6664 | 8.161699 | 6.332241 | 4.890635 | 1.262157 | 1.915884 | 2.202250 | ... | 114.696552 | 1460.018851 | 0.8700 | 0.045659 | 6443.152184 | 0.001618 | 5719.772883 | 36.068557 | 2061.489655 | yes |
| 1999 | 1999 | 126.4380 | 119.4280 | 127.2596 | 13.291582 | 13.517367 | 12.350425 | -0.764488 | -1.018323 | -0.429107 | ... | 70.186795 | 1871.270790 | 0.9572 | 0.041436 | 14538.563310 | 0.001420 | 13644.766405 | 32.782707 | 1569.125784 | yes |
2000 rows × 80 columns
# Rename the new `index` column to `rowid` so it reads as a row identifier.
ppg.reset_index().rename(columns={'index': 'rowid'})
| rowid | X01 | X02 | X03 | X04 | X05 | X06 | X07 | X08 | X09 | ... | X70 | X71 | X72 | X73 | X74 | X75 | X76 | X77 | X78 | corrosion | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 52.4988 | 22.4352 | 15.7248 | 6.300127 | 2.130775 | 1.396948 | 0.169325 | 0.165502 | -0.435505 | ... | 114.475601 | 395.783685 | 0.5492 | 0.073049 | 888.703569 | 0.002911 | 515.732996 | 57.139364 | 2211.942462 | yes |
| 1 | 1 | 163.6560 | 161.7176 | 164.0408 | 8.108419 | 13.026321 | 15.191390 | 2.292762 | -2.654124 | -2.700556 | ... | 111.268083 | 1116.853313 | 0.7908 | 0.050112 | 24964.871523 | 0.001674 | 20294.367898 | 39.680852 | 16779.045018 | no |
| 2 | 2 | 218.9088 | 217.9228 | 212.4132 | 5.880993 | 7.158718 | 7.370378 | -0.551242 | -0.718916 | -0.860859 | ... | 101.217219 | 1611.247241 | 0.9060 | 0.043729 | 45281.834879 | 0.001453 | 40845.652236 | 34.635325 | 10159.520530 | no |
| 3 | 3 | 45.7112 | 29.8620 | 26.9060 | 6.656470 | 3.076322 | 2.237222 | 0.380476 | 0.434857 | 0.033068 | ... | 121.376398 | 652.679503 | 0.6440 | 0.062262 | 1126.320497 | 0.002554 | 796.709264 | 48.727540 | 1968.103727 | yes |
| 4 | 4 | 244.6048 | 141.9512 | 129.5172 | 5.024004 | 16.503552 | 19.441422 | -1.712171 | 1.971901 | 1.974853 | ... | 68.184074 | 1072.486039 | 0.7736 | 0.051221 | 28473.791624 | 0.001706 | 23440.503221 | 40.563108 | 20032.938987 | no |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 1995 | 1995 | 164.0856 | 137.2884 | 113.3964 | 36.614804 | 54.923099 | 54.589736 | -0.593199 | -0.636593 | -0.495143 | ... | 30.718405 | 1812.093299 | 0.9432 | 0.042067 | 21328.518236 | 0.001456 | 19595.830121 | 33.269305 | 3839.905004 | yes |
| 1996 | 1996 | 42.5652 | 24.2252 | 21.2204 | 5.840698 | 2.702755 | 2.306474 | 0.335472 | 0.537995 | 0.313693 | ... | 127.580076 | 611.538462 | 0.6344 | 0.063441 | 842.618537 | 0.002734 | 585.553772 | 49.465321 | 1282.624212 | yes |
| 1997 | 1997 | 80.9216 | 71.1844 | 67.2952 | 25.794051 | 38.100226 | 41.485642 | -0.515049 | -0.302666 | -0.216146 | ... | 45.252311 | 1452.745841 | 0.8656 | 0.046187 | 6109.939926 | 0.001883 | 5280.458892 | 36.252013 | 3181.100739 | yes |
| 1998 | 1998 | 136.0328 | 62.9732 | 33.6664 | 8.161699 | 6.332241 | 4.890635 | 1.262157 | 1.915884 | 2.202250 | ... | 114.696552 | 1460.018851 | 0.8700 | 0.045659 | 6443.152184 | 0.001618 | 5719.772883 | 36.068557 | 2061.489655 | yes |
| 1999 | 1999 | 126.4380 | 119.4280 | 127.2596 | 13.291582 | 13.517367 | 12.350425 | -0.764488 | -1.018323 | -0.429107 | ... | 70.186795 | 1871.270790 | 0.9572 | 0.041436 | 14538.563310 | 0.001420 | 13644.766405 | 32.782707 | 1569.125784 | yes |
2000 rows × 80 columns
# Build the long-format frame. The original row number is kept as `rowid` and,
# together with `corrosion`, identifies each observation; every column listed in
# `input_names` is stacked into (variable, value) pairs.
lf = (
    ppg
    .reset_index()
    .rename(columns={'index': 'rowid'})
    .melt(
        id_vars=['rowid', 'corrosion'],
        value_vars=input_names,
        ignore_index=True,
    )
)

# Confirm the four expected columns and their dtypes.
lf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 156000 entries, 0 to 155999 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rowid 156000 non-null int64 1 corrosion 156000 non-null object 2 variable 156000 non-null object 3 value 156000 non-null float64 dtypes: float64(1), int64(1), object(2) memory usage: 4.8+ MB
# Count how many long-format rows each original row produced, ordered by rowid.
# Each count should equal the number of stacked inputs.
lf.rowid.value_counts().sort_index()
rowid
0 78
1 78
2 78
3 78
4 78
..
1995 78
1996 78
1997 78
1998 78
1999 78
Name: count, Length: 2000, dtype: int64
len( input_names )
78
# All long-format rows that came from the first original observation (rowid 0):
# one row per stacked input.
lf.loc[lf['rowid'] == 0, :]
| rowid | corrosion | variable | value | |
|---|---|---|---|---|
| 0 | 0 | yes | X01 | 52.498800 |
| 2000 | 0 | yes | X02 | 22.435200 |
| 4000 | 0 | yes | X03 | 15.724800 |
| 6000 | 0 | yes | X04 | 6.300127 |
| 8000 | 0 | yes | X05 | 2.130775 |
| ... | ... | ... | ... | ... |
| 146000 | 0 | yes | X74 | 888.703569 |
| 148000 | 0 | yes | X75 | 0.002911 |
| 150000 | 0 | yes | X76 | 515.732996 |
| 152000 | 0 | yes | X77 | 57.139364 |
| 154000 | 0 | yes | X78 | 2211.942462 |
78 rows × 4 columns
Now we can use the SEABORN facets to create separate subplots for every input in the problem!
# Histogram of the pooled `value` column, i.e. all stacked inputs in one plot.
# NOTE(review): the recorded output for this cell ends in a KeyboardInterrupt,
# so the run was stopped before a figure was produced -- presumably because
# pooling inputs with very different scales makes the histogram expensive to
# compute. Confirm before re-running.
sns.displot(data = lf, x='value', kind='hist')
plt.show()
KeyboardInterrupt
Error in callback <function _draw_all_if_interactive at 0x000002691692E550> (for post_execute):
KeyboardInterrupt
Error in callback <function flush_figures at 0x000002691BC19EE0> (for post_execute):
KeyboardInterrupt
# One histogram panel per input via col='variable'. No col_wrap is given, so
# all panels are laid out in a single row.
# common_bins=False asks seaborn not to reuse one set of bin edges for every
# subset of the data.
sns.displot(data = lf, x='value', col='variable', kind='hist',
common_bins=False)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Same faceted histograms, wrapped so that at most 8 panels appear per row.
# The x and y axes are still shared across panels (the facet grid default).
sns.displot(data = lf, x='value', col='variable', kind='hist',
col_wrap=8,
common_bins=False)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
The inputs have different SCALES in this application.
# Wide-form call on the original frame: no x/y is given, so presumably seaborn
# draws one box per numeric column on a common value axis, which exposes the
# differences in scale between inputs.
# aspect=3 makes the figure three times as wide as it is tall.
sns.catplot(data = ppg, kind='box', aspect=3)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
We need each FACET to have its own SCALES or we need FREE axes!!!
# Faceted histograms again, but facet_kws turns off axis sharing so every panel
# gets its own x and y ranges.
sns.displot(data = lf, x='value', col='variable', kind='hist',
col_wrap=8,
common_bins=False,
facet_kws={'sharex': False, 'sharey': False})
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
The long format data are more flexible because we did NOT stack the categorical variable! We can now condition each input distribution ON the categorical!
# Conditional KDEs: one panel per input and one curve per `corrosion` class
# (hue). common_norm=False normalizes each curve independently, and the
# facet_kws entry gives every panel its own x and y ranges.
sns.displot(data = lf, x='value', col='variable', kind='kde',
hue='corrosion', common_norm=False,
col_wrap=8,
facet_kws={'sharex': False, 'sharey': False})
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
# Box plots of every input grouped by the `corrosion` class, one panel per input.
# Only the y axis is freed: each input has its own scale, but every panel has
# the same categories on x. Keeping x shared lets seaborn keep the category
# order consistent across panels and avoids the
# "Setting `sharex=False` with `color=None`" UserWarning.
sns.catplot(
    data=lf,
    x='corrosion',
    y='value',
    col='variable',
    kind='box',
    col_wrap=8,
    sharey=False,
)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\categorical.py:3201: UserWarning: Setting `sharex=False` with `color=None` may cause different levels of the `x` variable to share colors. This will change in a future version.
warnings.warn(msg.format("sharex", "x"), UserWarning)
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight
self._figure.tight_layout(*args, **kwargs)
# Point plots of the per-class estimate (seaborn's default estimator is the
# mean) of every input, one panel per input, 8 panels per row. Only the y axis
# is freed so each input uses its own scale.
# NOTE(review): seaborn 0.13 deprecates `join` in favor of linestyle='none' --
# check the installed seaborn version before changing this argument.
sns.catplot(data = lf, x='corrosion', y='value', kind='point', join=False,
col='variable', col_wrap=8,
sharey=False)
plt.show()
C:\Users\jyurk\anaconda3\envs\cmpinf2120\lib\site-packages\seaborn\axisgrid.py:118: UserWarning: The figure layout has changed to tight self._figure.tight_layout(*args, **kwargs)
This notebook reviewed how to reshape using the .melt() method from WIDE to LONG format.
This notebook demonstrated why it is so essential to first check the SCALES and MAGNITUDE of the variables before creating separate facets for all variables.
This notebook showed how to use facets to visualize marginal and conditional distributions for MANY variables in a data set within a single figure.